{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas import Series, DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dateofdeath</th>\n",
       "      <th>age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2016-01-01</td>\n",
       "      <td>71</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2016-01-01</td>\n",
       "      <td>74</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2016-01-01</td>\n",
       "      <td>79</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2016-01-01</td>\n",
       "      <td>45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2016-01-01</td>\n",
       "      <td>83</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  dateofdeath age\n",
       "0  2016-01-01  71\n",
       "1  2016-01-01  74\n",
       "2  2016-01-01  79\n",
       "3  2016-01-01  45\n",
       "4  2016-01-01  83"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Location of the 2016 celebrity-deaths CSV, relative to this notebook\n",
    "filename = '../data/celebrity_deaths_2016.csv'\n",
    "\n",
    "# Load only the two columns this exercise works with\n",
    "df = pd.read_csv(filename, usecols=['dateofdeath', 'age'])\n",
    "\n",
    "# Preview the first five rows\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows whose \"age\" consists solely of digits, then store it as int64.\n",
    "# Written as one chain ending in .assign(), so no column is ever written onto a\n",
    "# filtered frame. The digit test goes through astype(str) so that re-running this\n",
    "# cell (when \"age\" is already int64) does not fail on the .str accessor.\n",
    "df = (\n",
    "    df\n",
    "    # (1) Remove all NaNs\n",
    "    .dropna(subset=['age'])\n",
    "    # (2) Remove all rows whose age contains anything other than digits\n",
    "    .loc[lambda frame: frame['age'].astype(str).str.isdigit()]\n",
    "    # (3) Convert the remaining digit strings to integers\n",
    "    .assign(age=lambda frame: frame['age'].astype(np.int64))\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Beyond 1\n",
    "\n",
    "Add a new column, `day`, from the day of the month in which the celebrity died. Then create a multi-index (from `month` and `day`). What was the average age of death from Feb. 15th through July 15th?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "77.05183037332367"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Carve the month (characters [5:7]) and the day (characters [8:]) out of the\n",
    "# dateofdeath text, make the pair a multi-index, and sort it so that the\n",
    "# label-range lookup below can slice it\n",
    "df = (\n",
    "    df\n",
    "    .assign(month=lambda frame: frame['dateofdeath'].str[5:7],\n",
    "            day=lambda frame: frame['dateofdeath'].str[8:])\n",
    "    .set_index(['month', 'day'])\n",
    "    .sort_index()\n",
    ")\n",
    "\n",
    "# Average of the 'age' column over the rows from Feb 15th through July 15th\n",
    "df.loc[('02', '15'):('07', '15'), 'age'].mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Beyond 2\n",
    "\n",
    "The CSV file contains another column, `causeofdeath`. Load that into a data frame, and find the five most common causes of death. Now replace any `NaN` values in that column with the string `'unknown'`, and again find the five most common causes of death."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "causeofdeath\n",
       " cancer               248\n",
       " heart attack         125\n",
       " traffic collision     56\n",
       " lung cancer           51\n",
       " pneumonia             50\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Reload the CSV, this time keeping the cause-of-death column as well\n",
    "filename = '../data/celebrity_deaths_2016.csv'\n",
    "\n",
    "df = pd.read_csv(filename, usecols=['dateofdeath', 'age', 'causeofdeath'])\n",
    "\n",
    "# Tally each cause of death and show the five most frequent\n",
    "df['causeofdeath'].value_counts().head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "causeofdeath\n",
       "unknown               5008\n",
       " cancer                248\n",
       " heart attack          125\n",
       " traffic collision      56\n",
       " lung cancer            51\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Label every missing cause as 'unknown' and tally again. More than 5,000 rows\n",
    "# land in that bucket, so this data set isn't very reliable for causes of death!\n",
    "df = df.assign(causeofdeath=df['causeofdeath'].fillna('unknown'))\n",
    "df['causeofdeath'].value_counts().head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Beyond 3\n",
    "\n",
    "If someone asks whether cancer is in the top 10 causes, what would you say? Can we be more specific than that?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "causeofdeath\n",
       "unknown               5008\n",
       " cancer                248\n",
       " heart attack          125\n",
       " traffic collision      56\n",
       " lung cancer            51\n",
       " pneumonia              50\n",
       " heart failure          49\n",
       " shot                   42\n",
       " stroke                 36\n",
       " pancreatic cancer      35\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The list has a generic \"cancer\" entry, but also \"lung cancer\" and \"pancreatic cancer.\"\n",
    "\n",
    "# We can't tell whether plain \"cancer\" means \"some other cancer,\" a death that\n",
    "# wasn't classified well, or something else.\n",
    "\n",
    "# So this data set is instructive precisely because it is not very reliable, at least\n",
    "# for causes of death. We would want something more rigorous before making serious decisions.\n",
    "df['causeofdeath'].value_counts().iloc[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
